Load Libraries

library(reshape2)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(scater)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'scater'
## The following objects are masked from 'package:plotly':
## 
##     arrange, filter, mutate, rename
## The following object is masked from 'package:stats':
## 
##     filter
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:scater':
## 
##     arrange, filter, mutate, rename
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(corrplot)
## corrplot 0.84 loaded
library(rpart)

Introduction to Dataset

Data is from an one level of an online geography tutoring system used by 5th grade students. The game involves:

A pre-test of geography knowledge (pre.test)

A series of assignments for which you have the average score (av.assignment.score)

The number of messages sent by each student to other students about the assignments (messages)

The number of forum posts students posted asking questions about the assignment (forum.posts)

A post test at the end of the level (post.test)

Whether or not the system allowed the students to go on to the next level (level.up).

Upload data

D1 = read.csv('online.data.csv')

Visualization

# Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation)
# I plot them in one figure with "multiplot" from package "scater"
hp1 <- ggplot(D1, aes(x=post.test.score)) + geom_histogram(binwidth=0.1,colour="white") + facet_grid(~level.up)

hp2 <- ggplot(D1, aes(x=pre.test.score)) + geom_histogram(binwidth=0.1,colour="white") + facet_grid(~level.up)

hp3 <- ggplot(D1, aes(x=messages)) + geom_histogram(binwidth=10,colour="white") + facet_grid(~level.up)

hp4 <- ggplot(D1, aes(x=forum.posts)) + geom_histogram(binwidth=3,colour="white") + facet_grid(~level.up)

hp5 <- ggplot(D1, aes(x=av.assignment.score)) + geom_histogram(binwidth=0.05,colour="white") + facet_grid(~level.up)

multiplot(hp1, hp2, hp3, hp4, hp5, cols = 3)

# Then visualize the relationships between variables with correlogram plot after removing id and converting level.up to numbers.
D2 = select(D1, -c(id))
D2$level.up = as.character(D2$level.up)
D2$level.up[D2$level.up == "no"] = 0
D2$level.up[D2$level.up == "yes"] = 1
D2$level.up = as.numeric(D2$level.up)
COR <- cor(D2)

corrplot(COR, order="AOE", method="circle", tl.pos="lt", type="upper",        
tl.col="black", tl.cex=0.6, tl.srt=45, 
        addCoef.col="black", addCoefasPercent = TRUE,
        sig.level=0.50, insig = "blank")

Try to capture an intution about the data and the relationships

First, if spilit data on level.up (yes or no), the distribution of all other variables show different patterns. At a glance, the “yes” group has higher post test score, # of messages, average assignment score, pre test score, and # of forum posts than the “no” group.

Second, if looking at outcome variables (post test score and level.up), they are more correlated with average assignment scores and # of messages. Particularly, # of messages and post test score are strongly correlated.

Classification tree

#Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums)

# Model 1, I use # of messages, # of forum posts and pre test score.
c.tree1 <- rpart(level.up ~ messages + forum.posts + pre.test.score, method="class", data=D1, control=rpart.control(minsplit=2, minbucket=1))

#Plot and generate a CP table for your tree 
printcp(c.tree1)
## 
## Classification tree:
## rpart(formula = level.up ~ messages + forum.posts + pre.test.score, 
##     data = D1, method = "class", control = rpart.control(minsplit = 2, 
##         minbucket = 1))
## 
## Variables actually used in tree construction:
## [1] messages       pre.test.score
## 
## Root node error: 400/1000 = 0.4
## 
## n= 1000 
## 
##        CP nsplit rel error xerror     xstd
## 1 0.54250      0    1.0000 1.0000 0.038730
## 2 0.01125      1    0.4575 0.4575 0.030569
## 3 0.01000      3    0.4350 0.4750 0.031014
post(c.tree1, file = "tree1.ps", title = "Level up: 1 - YES, 0 - NO")

#Generate a probability value that represents the probability that a student levels up based your classification tree

D1$pred <- predict(c.tree1, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on.

#Plot the ROC curve
pred.detail <- prediction(D1$pred, D1$level.up)
plot(performance(pred.detail, "tpr", "fpr"))
abline(0, 1, lty = 2)

#Calculate the Area Under the Curve
unlist(slot(performance(pred.detail,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR
## [1] 0.8825125
#Now repeat this process with average assignment score and post test score.

# Model 2, I use # of messages, # of forum posts and pre test score.
c.tree2 <- rpart(level.up ~ av.assignment.score + post.test.score, method="class", data=D1, control=rpart.control(minsplit=2, minbucket=1))

printcp(c.tree2)
## 
## Classification tree:
## rpart(formula = level.up ~ av.assignment.score + post.test.score, 
##     data = D1, method = "class", control = rpart.control(minsplit = 2, 
##         minbucket = 1))
## 
## Variables actually used in tree construction:
## [1] av.assignment.score post.test.score    
## 
## Root node error: 400/1000 = 0.4
## 
## n= 1000 
## 
##     CP nsplit rel error xerror     xstd
## 1 0.93      0      1.00   1.00 0.038730
## 2 0.07      1      0.07   0.07 0.013042
## 3 0.01      2      0.00   0.00 0.000000
post(c.tree2, file = "tree2.ps", title = "Level up: 1 - YES, 0 - NO")

D1$pred2 <- predict(c.tree2, type = "prob")[,2]

pred2.detail <- prediction(D1$pred2, D1$level.up) 
plot(performance(pred2.detail, "tpr", "fpr"))
abline(0, 1, lty = 2)

Which one do you think was the better model? Why?

unlist(slot(performance(pred.detail,"auc"), "y.values"))
## [1] 0.8825125
unlist(slot(performance(pred2.detail,"auc"), "y.values"))
## [1] 1

Therefore, model 2 is better since the area under the ROC curve is larger (1, which indicates an excellent model, compared to model 1, 0.8825, which indicates a good model) and the turning point is sharper.

Thresholds

#Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives.

# However, if ROC plot only shows the true postive rate and false positive rate, but not the associated probability, some extra work must be done:
p1 = ggplot(D1, aes(x=pred)) + geom_histogram(binwidth=0.1,colour="white")
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Actually, only 4 values of probabilities exist in D1$pred and each generate a section of ROC curve from bottom left to top right. Therefore, I choose 0.5 as the thredhold.

# Then generate a new variable in your data set that classifies each student according to your chosen threshold.

threshold.pred1 <- 0.5

Another ways to estimate the threshold

The first one uses the “cost” function in ROCR (https://www.r-bloggers.com/a-small-introduction-to-the-rocr-package/), which finds the closest point to tpr=1 and fpr=0 (the top left corner). But it gave a wrong value here.

The second one prints the content of the prediction object (https://stackoverflow.com/questions/16347507/obtaining-threshold-values-from-a-roc-curve) including all the cutoff points with associated fpr and tpr. So we can choose the cutoff point with eyes.

# example of the second solution:
perf = performance(pred.detail, "tpr", "fpr")
cutoffs <- data.frame(cut=perf@alpha.values[[1]], fpr=perf@x.values[[1]], tpr=perf@y.values[[1]])
cutoffs
##          cut       fpr    tpr
## 1        Inf 0.0000000 0.0000
## 2 0.84400000 0.0650000 0.5275
## 3 0.61250000 0.2200000 0.8950
## 4 0.40425532 0.2666667 0.9425
## 5 0.04967603 1.0000000 1.0000

Now generate three diagnostics:

D1$pred_int = 0
D1$pred_int[D1$pred >= threshold.pred1] = 1

table1 = table(D1$pred_int, D1$level.up)

# Accuracy = correct predictions / total predictions
D1$accuracy.model1 <- (table1[1,1]+table1[2,2])/1000
unique(D1$accuracy.model1)
## [1] 0.826
# Precision = True Pos. / (True Pos. + False Pos.)
D1$precision.model1 <- table1[2,2]/(table1[2,2]+table1[2,1])
unique(D1$precision.model1)
## [1] 0.7306122
# Recall = True Pos. / (True Pos. + False Neg.)
D1$recall.model1 <- table1[2,2]/(table1[2,2]+table1[1,2])
unique(D1$recall.model1)
## [1] 0.895
#Finally, calculate Kappa for your model according to:
table1 <- table(D1$level.up, D1$pred_int)
#Convert to matrix
matrix1 <- as.matrix(table1)

#Calculate kappa
kappa(matrix1, exact = TRUE)/kappa(matrix1)
## [1] 0.9944954

Now choose a different threshold (0.7) value and repeat these diagnostics.

threshod2 = 0.7

D1$pred_int_2 = 0
D1$pred_int_2[D1$pred >= threshod2] = 1

table2 = table(D1$pred_int_2, D1$level.up)

D1$accuracy.model2 <- (table2[1,1]+table2[2,2])/1000
unique(D1$accuracy.model2)
## [1] 0.772
D1$precision.model2 <- table2[2,2]/(table2[2,2]+table2[2,1])
unique(D1$precision.model2)
## [1] 0.844
D1$recall.model2 <- table2[2,2]/(table2[2,2]+table2[1,2])
unique(D1$recall.model2)
## [1] 0.5275
#Finally, calculate Kappa for your model according to:
table2 <- table(D1$level.up, D1$pred_int_2)
#Convert to matrix
matrix2 <- as.matrix(table2)

#Calculate kappa
kappa(matrix2, exact = TRUE)/kappa(matrix2)
## [1] 1.040758

According to the result:

1, Accuracy and recall decrease from model 1 to model 2 (threshold 1 to threshold 2), while precision increase. That means threshold 2 gives a more “useful” model, while not necessarily represent the whole picture of the data.

2, Cohen’s Kappa increases from threshold 1 to threshold 2, which indicates more model reliability.

3, And threshold 2 also gives higher true positive rate as well as higher false positive rate.